PetCause Media Inc. - Analysis on Processed and Enriched Data

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
In [3]:
#import data
final = pd.read_csv("/Users/galenhancock/Desktop/Capstone/final_preprocessed3.csv", encoding='latin1', header=0)
In [194]:
final.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4549 entries, 0 to 4548
Data columns (total 76 columns):
UNID                       4549 non-null object
CM_status                  4549 non-null int64
CareManagement             4549 non-null int64
CareManagement_prev        4549 non-null int64
Count_of_pets              4549 non-null int64
Dental                     4549 non-null int64
Dental_prev                4549 non-null int64
Dental_status              4549 non-null int64
FleaTick                   4549 non-null int64
FleaTick_prev              4549 non-null int64
Fleatick_status            4549 non-null int64
HasMicrochip               4549 non-null int64
Heartwork_status           4549 non-null int64
Heartworm                  4549 non-null int64
Heartworm_prev             4549 non-null int64
Microchip                  4549 non-null int64
Microchip_prev             4549 non-null int64
Microchip_status           4549 non-null int64
Num_vaccines_bought        4549 non-null int64
PodioId                    4549 non-null object
PracticeDoctorID           4549 non-null float64
displayid                  4549 non-null int64
Age                        4549 non-null float64
Species_Bovine             4549 non-null uint8
Species_Camelid            4549 non-null uint8
Species_Canine             4549 non-null uint8
Species_Caprin             4549 non-null uint8
Species_Cavia              4549 non-null uint8
Species_Equine             4549 non-null uint8
Species_Feline             4549 non-null uint8
Species_Lagomo             4549 non-null uint8
Species_Lagomorp           4549 non-null uint8
Species_Mustelid           4549 non-null uint8
Species_Pocket P           4549 non-null uint8
Species_Porcine            4549 non-null uint8
Species_Poultry            4549 non-null uint8
Species_Reptile            4549 non-null uint8
Species_Rodent             4549 non-null uint8
content_Charity            4549 non-null float64
content_Cophraphagia       4549 non-null float64
content_Dental             4549 non-null float64
content_Flea               4549 non-null float64
content_General            4549 non-null float64
content_Heartworm          4549 non-null float64
content_Insurance          4549 non-null float64
content_Laser              4549 non-null float64
content_Microchip          4549 non-null float64
content_Nutrition          4549 non-null float64
content_Senior             4549 non-null float64
content_Surgery            4549 non-null float64
content_Weight             4549 non-null float64
content_Youth              4549 non-null float64
media_Adoptable            4549 non-null float64
media_Biography            4549 non-null float64
media_EduSponsored         4549 non-null float64
media_EduUnsponsored       4549 non-null float64
media_Funny                4549 non-null float64
media_Practice             4549 non-null float64
media_SponsorStatic        4549 non-null float64
media_SponsorVid           4549 non-null float64
media_Support              4549 non-null float64
media_Welcome              4549 non-null float64
species_All                4546 non-null float64
species_Cat                4546 non-null float64
species_Dog                4546 non-null float64
venue_WaitingRoom          4549 non-null float64
media_type_image           4549 non-null float64
media_type_video           4549 non-null float64
total_content_tags         4549 non-null int64
total_media_tags           4549 non-null int64
total_species_tags         4549 non-null int64
total_mediatype_tags       4549 non-null int64
total_ad_tags              4549 non-null int64
sum_prev                   4549 non-null int64
sum_current                4549 non-null int64
difference_before_after    4549 non-null int64
dtypes: float64(32), int64(27), object(2), uint8(15)
memory usage: 2.2+ MB
In [7]:
final = final.drop(['Breed', 'Birthday', 'Unnamed: 0', 'ClientNumber', 'Client_UNID', 'Client_UNID_Date', 'AssignDate', 'CreateDate', 'GivenDate', 'PetNumber', 'practice_client_UNID', 'tagId'], axis=1)
In [8]:
final.head()
Out[8]:
CM_status CareManagement CareManagement_prev Count_of_pets Dental Dental_prev Dental_status FleaTick FleaTick_prev Fleatick_status ... media_Practice media_SponsorStatic media_SponsorVid media_Support media_Welcome species_All species_Cat species_Dog venue_WaitingRoom Age
0 unchanged CAUTION CAUTION 25 BAD BAD unchanged BAD BAD unchanged ... 5 12 10 0 4 10 9 9 50 7.246945
1 unchanged CAUTION CAUTION 6 BAD BAD unchanged BAD BAD unchanged ... 24 47 48 0 23 63 42 28 234 5.864302
2 unchanged CAUTION CAUTION 5 BAD BAD unchanged BAD BAD unchanged ... 13 22 23 0 11 31 21 11 113 12.618719
3 unchanged CAUTION CAUTION 4 CAUTION OK worse BAD BAD unchanged ... 15 33 26 0 14 30 31 17 147 8.259971
4 unchanged CAUTION CAUTION 25 BAD BAD unchanged BAD BAD unchanged ... 23 31 31 0 15 46 29 11 159 6.001197

5 rows × 54 columns

In [9]:
final = pd.get_dummies(final, columns = ['Species'])
In [10]:
columns_ordered = ['UNID','CM_status', 'CareManagement', 'CareManagement_prev',
       'Count_of_pets', 'Dental', 'Dental_prev', 'Dental_status',
       'FleaTick', 'FleaTick_prev', 'Fleatick_status', 'HasMicrochip',
       'Heartwork_status', 'Heartworm', 'Heartworm_prev', 'Microchip',
       'Microchip_prev', 'Microchip_status', 'Num_vaccines_bought',
       'PodioId', 'PracticeDoctorID', 'displayid', 'Age', 'Species_Bovine',
       'Species_Camelid', 'Species_Canine', 'Species_Caprin',
       'Species_Cavia', 'Species_Equine', 'Species_Feline',
       'Species_Lagomo', 'Species_Lagomorp', 'Species_Mustelid',
       'Species_Pocket P', 'Species_Porcine', 'Species_Poultry',
       'Species_Reptile', 'Species_Rodent', 'content_Charity',
       'content_Cophraphagia', 'content_Dental', 'content_Flea',
       'content_General', 'content_Heartworm', 'content_Insurance',
       'content_Laser', 'content_Microchip', 'content_Nutrition',
       'content_Senior', 'content_Surgery', 'content_Weight',
       'content_Youth', 'media_Adoptable', 'media_Biography',
       'media_EduSponsored', 'media_EduUnsponsored', 'media_Funny',
       'media_Practice', 'media_SponsorStatic', 'media_SponsorVid',
       'media_Support', 'media_Welcome', 'species_All', 'species_Cat',
       'species_Dog', 'venue_WaitingRoom', 'media_type_image',
       'media_type_video']
In [11]:
final = final[columns_ordered]
In [12]:
#get sum of tags across each umbrella category, so that the percentage of each tag seen in each category can be calculated. 
#For example, in some cases, 20% of the media's content was dental related, while the remaining 80% was distributed across all 13 other content tags it could possess.
final['total_content_tags'] = final.iloc[:,38:52].sum(axis=1)
final['total_media_tags'] = final.iloc[:, 52:62].sum(axis=1)
final['total_species_tags'] = final.iloc[:, 62:65].sum(axis=1)
final['total_mediatype_tags'] = final.iloc[:, 66:68].sum(axis=1)
final['total_ad_tags'] = final.iloc[:, 38:66].sum(axis=1)
final['Count_of_pets'] = final['Count_of_pets'].astype(int)
In [14]:
final.head()
Out[14]:
UNID CM_status CareManagement CareManagement_prev Count_of_pets Dental Dental_prev Dental_status FleaTick FleaTick_prev ... species_Cat species_Dog venue_WaitingRoom media_type_image media_type_video total_content_tags total_media_tags total_species_tags total_mediatype_tags total_ad_tags
0 GA0412171-2799-42811 unchanged CAUTION CAUTION 25 BAD BAD unchanged BAD BAD ... 9 9 50 23 27 50 50 28 50 178
1 GA0412171-214-43552 unchanged CAUTION CAUTION 6 BAD BAD unchanged BAD BAD ... 42 28 234 104 130 234 234 133 234 835
2 GA0412171-11720-50344 unchanged CAUTION CAUTION 5 BAD BAD unchanged BAD BAD ... 21 11 113 47 66 113 113 63 113 402
3 GA0412171-4678-40061 unchanged CAUTION CAUTION 4 CAUTION OK worse BAD BAD ... 31 17 147 72 75 147 147 78 147 519
4 GA0412171-755-46496 unchanged CAUTION CAUTION 25 BAD BAD unchanged BAD BAD ... 29 11 159 68 91 159 159 86 159 563

5 rows × 73 columns

In [15]:
#standardize these columns
#average of ad tags in each category
final['content_Charity'] = (final['content_Charity']/final['total_content_tags'])*100
final['content_Cophraphagia'] = (final['content_Cophraphagia']/final['total_content_tags'])*100
final['content_Dental'] = (final['content_Dental']/final['total_content_tags'])*100
final['content_Flea'] = (final['content_Flea']/final['total_content_tags'])*100
final['content_General'] = (final['content_General']/final['total_content_tags'])*100
final['content_Heartworm'] = (final['content_Heartworm']/final['total_content_tags'])*100
final['content_Insurance'] = (final['content_Insurance']/final['total_content_tags'])*100
final['content_Laser'] = (final['content_Laser']/final['total_content_tags'])*100
final['content_Microchip'] = (final['content_Microchip']/final['total_content_tags'])*100
final['content_Nutrition'] = (final['content_Nutrition']/final['total_content_tags'])*100
final['content_Senior'] = (final['content_Senior']/final['total_content_tags'])*100
final['content_Surgery'] = (final['content_Surgery']/final['total_content_tags'])*100
final['content_Weight'] = (final['content_Weight']/final['total_content_tags'])*100
final['content_Youth'] = (final['content_Youth']/final['total_content_tags'])*100
final['media_Adoptable'] = (final['media_Adoptable']/final['total_media_tags'])*100
final['media_Biography'] = (final['media_Biography']/final['total_media_tags'])*100
final['media_EduSponsored'] = (final['media_EduSponsored']/final['total_media_tags'])*100
final['media_EduUnsponsored'] = (final['media_EduUnsponsored']/final['total_media_tags'])*100
final['media_Funny'] = (final['media_Funny']/final['total_media_tags'])*100
final['media_Practice'] = (final['media_Practice']/final['total_media_tags'])*100
final['media_SponsorStatic'] = (final['media_SponsorStatic']/final['total_media_tags'])*100
final['media_SponsorVid'] = (final['media_SponsorVid']/final['total_media_tags'])*100
final['media_Support'] = (final['media_Support']/final['total_media_tags'])*100
final['media_Welcome'] = (final['media_Welcome']/final['total_media_tags'])*100
final['species_All'] = (final['species_All']/final['total_species_tags'])*100
final['species_Cat'] = (final['species_Cat']/final['total_species_tags'])*100
final['species_Dog'] = (final['species_Dog']/final['total_species_tags'])*100
final['venue_WaitingRoom'] = (final['venue_WaitingRoom']/final['total_ad_tags'])*100
final['media_type_image'] = (final['media_type_image']/final['total_mediatype_tags'])*100
final['media_type_video'] = (final['media_type_video']/final['total_mediatype_tags'])*100
In [16]:
change_mapping = {'unchanged': 1,
                'worse': 0,
                'better': 2}
status_mapping = {'OK' : 2,
                 'CAUTION': 1,
                 'BAD': 0}

final['CM_status'] = final['CM_status'].map(change_mapping)
final['Dental_status'] = final['Dental_status'].map(change_mapping)
final['Heartwork_status'] = final['Heartwork_status'].map(change_mapping)
final['Microchip_status'] = final['Microchip_status'].map(change_mapping)
final['Fleatick_status'] = final['Fleatick_status'].map(change_mapping)

final['CareManagement'] = final['CareManagement'].map(status_mapping)
final['Dental'] = final['Dental'].map(status_mapping)
final['Heartworm'] = final['Heartworm'].map(status_mapping)
final['Microchip'] = final['Microchip'].map(status_mapping)
final['FleaTick'] = final['FleaTick'].map(status_mapping)

final['CareManagement_prev'] = final['CareManagement_prev'].map(status_mapping)
final['Dental_prev'] = final['Dental_prev'].map(status_mapping)
final['Heartworm_prev'] = final['Heartworm_prev'].map(status_mapping)
final['Microchip_prev'] = final['Microchip_prev'].map(status_mapping)
final['FleaTick_prev'] = final['FleaTick_prev'].map(status_mapping)
In [17]:
#sum current columns, and previous columns
final['sum_prev'] = final['CareManagement_prev'] + final['Dental_prev'] + final['FleaTick_prev'] + final['Heartworm_prev'] + final['Microchip']
final['sum_current'] = final['CareManagement'] + final['Dental'] + final['FleaTick'] + final['Heartworm'] + final['Microchip']

final['difference_before_after'] = final['sum_current'].astype(int) - final['sum_prev'].astype(int)
In [191]:
final.difference_before_after.describe()
Out[191]:
count    4549.000000
mean        0.001099
std         0.684191
min        -2.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         6.000000
Name: difference_before_after, dtype: float64
In [203]:
final.to_csv('final_with_integers.csv')
In [19]:
#not truly representative and will potentially render conclusions meaningless
final.difference_before_after.value_counts()
Out[19]:
 0    3569
-1     474
 1     217
 2     166
-2      91
 3      20
 4       9
 5       2
 6       1
Name: difference_before_after, dtype: int64
In [20]:
#build a representative datasetnew_df['all_ok'] = np.nan so that pets are at least 'OK' in one category
#dataset has to represent change as well as instances of all possible scorecard rankings for each of the below scorecard categories
CM_OK = final.loc[final['CareManagement'] == 2]
DENTAL_OK = final.loc[final['Dental'] == 2]
FLEATICK_OK = final.loc[final['FleaTick'] == 2]
MICROCHIP_OK = final.loc[final['Microchip'] == 2]
HEARTWORM_OK = final.loc[final['Heartworm'] == 2]

OK_df = CM_OK.append([DENTAL_OK, FLEATICK_OK, MICROCHIP_OK, HEARTWORM_OK])



OK_df = OK_df.drop_duplicates(keep='first')

print(OK_df.difference_before_after.value_counts())
 0    1988
-1     246
 1     169
 2     151
-2      58
 3      20
 4       9
 5       2
 6       1
Name: difference_before_after, dtype: int64
In [196]:
len(OK_df)
Out[196]:
2644
In [21]:
print(OK_df.Heartworm.value_counts())
print(OK_df.FleaTick.value_counts())
print(OK_df.Dental.value_counts())
print(OK_df.CareManagement.value_counts())
print(OK_df.Microchip.value_counts())
0    1216
1     751
2     677
Name: Heartworm, dtype: int64
0    1655
1     715
2     274
Name: FleaTick, dtype: int64
2    1576
0     845
1     223
Name: Dental, dtype: int64
1    2323
2     321
Name: CareManagement, dtype: int64
1    1590
2    1054
Name: Microchip, dtype: int64
In [198]:
OK_df.describe()
Out[198]:
CM_status CareManagement CareManagement_prev Count_of_pets Dental Dental_prev Dental_status FleaTick FleaTick_prev Fleatick_status ... media_type_image media_type_video total_content_tags total_media_tags total_species_tags total_mediatype_tags total_ad_tags sum_prev sum_current difference_before_after
count 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 ... 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000 2644.000000
mean 1.001891 1.121407 1.119516 2.793116 1.276475 1.237897 1.016641 0.477685 0.472390 0.994327 ... 55.366320 44.633680 148.048033 148.048033 131.309758 148.048033 575.332073 4.986762 5.070348 0.083585
std 0.093267 0.326661 0.324456 3.233025 0.916263 0.923761 0.205181 0.675971 0.678011 0.351697 ... 8.874692 8.874692 519.360382 519.360382 482.995865 519.360382 2040.218776 1.402195 1.290396 0.788329
min 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 4.000000 2.000000 3.000000 -2.000000
25% 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 ... 50.724638 39.647776 45.000000 45.000000 40.000000 45.000000 176.000000 4.000000 4.000000 0.000000
50% 1.000000 1.000000 1.000000 2.000000 2.000000 2.000000 1.000000 0.000000 0.000000 1.000000 ... 55.869104 44.130896 88.000000 88.000000 79.000000 88.000000 344.000000 5.000000 5.000000 0.000000
75% 1.000000 1.000000 1.000000 3.000000 2.000000 2.000000 1.000000 1.000000 1.000000 1.000000 ... 60.352224 49.275362 144.250000 144.250000 128.000000 144.250000 560.250000 6.000000 6.000000 0.000000
max 2.000000 2.000000 2.000000 41.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 ... 100.000000 100.000000 13792.000000 13792.000000 13072.000000 13792.000000 54448.000000 10.000000 10.000000 6.000000

8 rows × 74 columns

In [192]:
#with the OK_df dataframe, we have a representation of compliant and under-compliant pets
OK_df.difference_before_after.describe()
Out[192]:
count    2644.000000
mean        0.083585
std         0.788329
min        -2.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         6.000000
Name: difference_before_after, dtype: float64
In [26]:
OK_df.sum_current.value_counts()
Out[26]:
4     926
5     691
6     495
7     259
3     151
8      92
9      25
10      5
Name: sum_current, dtype: int64
In [165]:
cols = ['Age',
 'Count_of_pets',
 'Num_vaccines_bought',
 'Species_Canine',
 'Species_Feline',
 'content_Charity',
 'content_Cophraphagia',
 'content_Dental',
 'content_Flea',
 'content_General',
 'content_Heartworm',
 'content_Insurance',
 'content_Laser',
 'content_Microchip',
 'content_Nutrition',
 'content_Senior',
 'content_Surgery',
 'content_Weight',
 'content_Youth',
 'media_Biography',
 'media_EduSponsored',
 'media_EduUnsponsored',
 'media_Practice',
 'media_SponsorStatic',
 'media_SponsorVid',
 'media_Support',
 'media_Welcome',
 'media_Funny',
 'species_All',
 'species_Cat',
 'species_Dog',
 'venue_WaitingRoom',
 'media_type_image', 
 'media_type_video',
 'Dental',
 'Dental_prev',
 'Heartworm',                   
 'Heartworm_prev',
 'Microchip',
 'Microchip_prev',
 'FleaTick',
 'FleaTick_prev',
 'CareManagement',
 'CareManagement_prev']

 #'something_changed_better',
 #'something_changed_worse']

#Pearson product-moment correlation coefficients
        
cm = np.corrcoef(OK_df[cols].values.T)

# Get and show our heat map
plt.figure(figsize=(30,30))
sns.set(font_scale = 2)
hm = sns.heatmap(cm,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.1f',
                 annot_kws={'size': 10},
                 yticklabels=cols,
                 xticklabels=cols)


plt.tight_layout()
plt.show()

figure = hm.get_figure()    
figure.savefig('CorrelationMatrix.png', dpi=400)

#sns.plot.savefig("CorrelationMatrix.png")
In [32]:
#features = attributes of owner, pet, current scorecard, scorecard goal
#targets = types of content that should be shown

#our dataset must have instances of when something changed for the better, because that is our goal
In [33]:
all_cols_list = list(OK_df.columns.values)
print(all_cols_list)
['UNID', 'CM_status', 'CareManagement', 'CareManagement_prev', 'Count_of_pets', 'Dental', 'Dental_prev', 'Dental_status', 'FleaTick', 'FleaTick_prev', 'Fleatick_status', 'HasMicrochip', 'Heartwork_status', 'Heartworm', 'Heartworm_prev', 'Microchip', 'Microchip_prev', 'Microchip_status', 'Num_vaccines_bought', 'PodioId', 'PracticeDoctorID', 'displayid', 'Age', 'Species_Bovine', 'Species_Camelid', 'Species_Canine', 'Species_Caprin', 'Species_Cavia', 'Species_Equine', 'Species_Feline', 'Species_Lagomo', 'Species_Lagomorp', 'Species_Mustelid', 'Species_Pocket P', 'Species_Porcine', 'Species_Poultry', 'Species_Reptile', 'Species_Rodent', 'content_Charity', 'content_Cophraphagia', 'content_Dental', 'content_Flea', 'content_General', 'content_Heartworm', 'content_Insurance', 'content_Laser', 'content_Microchip', 'content_Nutrition', 'content_Senior', 'content_Surgery', 'content_Weight', 'content_Youth', 'media_Adoptable', 'media_Biography', 'media_EduSponsored', 'media_EduUnsponsored', 'media_Funny', 'media_Practice', 'media_SponsorStatic', 'media_SponsorVid', 'media_Support', 'media_Welcome', 'species_All', 'species_Cat', 'species_Dog', 'venue_WaitingRoom', 'media_type_image', 'media_type_video', 'total_content_tags', 'total_media_tags', 'total_species_tags', 'total_mediatype_tags', 'total_ad_tags', 'sum_prev', 'sum_current', 'difference_before_after']
In [36]:
#fitting different models for different ad types
#first is for distribution of content
#second is for distribution of media
#third is for distribution of species
#fourth is for distribution of media_type
X_cols_all = [
 'Age',
 'Dental',                           
 'Heartworm',             
 'Dental_prev',      
 'Heartworm_prev',
 'Microchip',
 'Microchip_prev',
 'FleaTick',
 'FleaTick_prev',
 'CareManagement',
 'CareManagement_prev',
 'Count_of_pets',
 'Species_Canine',
 'HasMicrochip']

X_cols_some = [
 'Dental',                           
 'Heartworm',
 'Microchip',
 'FleaTick',
 'CareManagement']


Y_cols_all = ['content_Charity',
 'content_Cophraphagia',
 'content_Dental',
 'content_Flea',
 'content_General',
 'content_Heartworm',
 'content_Insurance',
 'content_Laser',
 'content_Microchip',
 'content_Nutrition',
 'content_Senior',
 'content_Surgery',
 'content_Weight',
 'content_Youth',
 'media_Biography',
 'media_EduSponsored',
 'media_EduUnsponsored',
 'media_Practice',
 'media_SponsorStatic',
 'media_SponsorVid',
 'media_Support',
 'media_Welcome',
 'species_All',
 'species_Cat',
 'species_Dog',
 'media_type_video',
 'media_type_image']
              
content = ['content_Charity',
 'content_Cophraphagia',
 'content_Dental',
 'content_Flea',
 'content_General',
 'content_Heartworm',
 'content_Insurance',
 'content_Laser',
 'content_Microchip',
 'content_Nutrition',
 'content_Senior',
 'content_Surgery',
 'content_Weight',
 'content_Youth']

Y_col_Heartworm = ['Heartworm']
Y_col_Dental = ['Dental']
In [37]:
media = ['media_Biography',
 'media_EduSponsored',
 'media_EduUnsponsored',
 'media_Practice',
 'media_SponsorStatic',
 'media_SponsorVid',
 'media_Support',
 'media_Welcome',
 'media_Funny']
In [38]:
#species_All not being used due to high correlation with species_Dog
species = [
 'species_Cat',
 'species_Dog']
In [84]:
#media_type_video eliminated due to high correlation with media_type_image
medtype = ['media_type_image', 'media_type_video']
In [40]:
KMeans_content = ['content_Charity',
 'content_Cophraphagia',
 'content_Dental',
 'content_Flea',
 'content_General',
 'content_Heartworm',
 'content_Insurance',
 'content_Laser',
 'content_Microchip',
 'content_Nutrition',
 'content_Senior',
 'content_Surgery',
 'content_Weight',
 'content_Youth',
 'Heartworm']
In [41]:
#get two arrays of features for testing, one with more explanatory variables included.
X = OK_df[X_cols_all].values

X_some = OK_df[X_cols_some].values
In [42]:
Heartworm_outcome = OK_df['Heartworm'].values
FleaTick_outcome = OK_df['FleaTick'].values
Dental_outcome = OK_df['Dental'].values

sum_current_outcome = OK_df['sum_current'].values
difference = OK_df['difference_before_after'].values
In [158]:
#Create a Multinomial Logistic Regressor to classify Scorecard ratings based on input variables:
#This implementation is from sci-kit learn. We will use it to observe the model coefficients of the X variables (rate of ad type seen in the room during the course of the appointment)

import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

#input into this function is the list of X columns desired and the y array
def classify_by_multilog(X_cols, y):
    

    X_train, X_test, y_train, y_test = train_test_split(OK_df[X_cols].values, y,
                                                        train_size=0.7)

    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)
    logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')

    model = logit.fit(X_train, y_train)
   

    new_y = model.predict(X_test)

    pred = model.predict_proba(X_test)
    
    
    for x in range(len(model.coef_)):
       
        model_class = model.classes_[x]

        objects = X_cols
        y_pos = np.arange(len(objects))
        performance = model.coef_[x]

        plt.bar(y_pos, performance, align='center', alpha=0.5)
        plt.xticks(y_pos, objects, rotation = 'vertical')
        plt.ylabel('Value')
        plt.title('Coefficients of Independent Variables for Y = %i' %model_class)

        plt.show()
    
    print('Model Score: ', model.score(X_test, y_test))

    print('Log Loss: ', log_loss(y_test, pred))
    
    print(classification_report(y_test, new_y))

Below are visuals indicating the model coefficients when Y = 0 , 1, 2 for each Scorecard category of Heartworm, Dental, and Flea/Tick. There are four models, each with different features corresponding with media attributes from the "Content" type of tag, the "Media theme" type of tag, the "Media Type" type of tag, and the "Species Type" type of tag.

In [159]:
#Classify Heartworm outcomes based on content ad tags
classify_by_multilog(content, Heartworm_outcome)
Model Score:  0.517632241814
Log Loss:  1.01712375968
              precision    recall  f1-score   support

           0       0.51      0.91      0.65       362
           1       0.50      0.02      0.03       227
           2       0.56      0.39      0.46       205

    accuracy                           0.52       794
   macro avg       0.52      0.44      0.38       794
weighted avg       0.52      0.52      0.42       794

In [161]:
#Classify Dental outcome based on content ad tags
classify_by_multilog(content, Dental_outcome)
Model Score:  0.739294710327
Log Loss:  0.672955679647
              precision    recall  f1-score   support

           0       0.80      0.55      0.65       256
           1       0.00      0.00      0.00        60
           2       0.73      0.94      0.82       478

    accuracy                           0.74       794
   macro avg       0.51      0.49      0.49       794
weighted avg       0.70      0.74      0.70       794

In [162]:
#Classify Flea/Tick outcome based on content ad tags
classify_by_multilog(content, FleaTick_outcome)
Model Score:  0.625944584383
Log Loss:  0.849102487446
              precision    recall  f1-score   support

           0       0.63      0.99      0.77       501
           1       0.17      0.00      0.01       212
           2       0.20      0.02      0.04        81

    accuracy                           0.63       794
   macro avg       0.33      0.34      0.28       794
weighted avg       0.47      0.63      0.49       794

In [69]:
#Classify Heartworm outcome based on species ad tags
classify_by_multilog(species, Heartworm_outcome)
Model Score:  0.430730478589
Log Loss:  1.08109739553
              precision    recall  f1-score   support

           0       0.43      0.99      0.60       344
           1       0.00      0.00      0.00       235
           2       0.00      0.00      0.00       215

    accuracy                           0.43       794
   macro avg       0.14      0.33      0.20       794
weighted avg       0.19      0.43      0.26       794

/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [70]:
#Classify Dental outcome based on species ad tags
classify_by_multilog(species, Dental_outcome)
Model Score:  0.600755667506
Log Loss:  0.872140021993
              precision    recall  f1-score   support

           0       0.59      0.16      0.26       267
           1       0.00      0.00      0.00        64
           2       0.60      0.94      0.73       463

    accuracy                           0.60       794
   macro avg       0.40      0.37      0.33       794
weighted avg       0.55      0.60      0.51       794

In [71]:
#Classify Flea/tick outcome based on species ad tags
classify_by_multilog(species, FleaTick_outcome)
Model Score:  0.624685138539
Log Loss:  0.879531368089
              precision    recall  f1-score   support

           0       0.62      1.00      0.77       496
           1       0.00      0.00      0.00       217
           2       0.00      0.00      0.00        81

    accuracy                           0.62       794
   macro avg       0.21      0.33      0.26       794
weighted avg       0.39      0.62      0.48       794

/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [72]:
#Classify Heartworm outcome based on media ad tags
classify_by_multilog(media, Heartworm_outcome)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:947: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)
Model Score:  0.47355163728
Log Loss:  1.03751532735
              precision    recall  f1-score   support

           0       0.47      0.95      0.63       361
           1       0.00      0.00      0.00       236
           2       0.52      0.17      0.25       197

    accuracy                           0.47       794
   macro avg       0.33      0.37      0.29       794
weighted avg       0.34      0.47      0.35       794

/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [73]:
#Classify Dental outcome based on media ad tags
classify_by_multilog(media, Dental_outcome)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:947: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)
Model Score:  0.663727959698
Log Loss:  0.782868494468
              precision    recall  f1-score   support

           0       0.82      0.29      0.43       265
           1       0.33      0.02      0.03        65
           2       0.64      0.97      0.77       464

    accuracy                           0.66       794
   macro avg       0.60      0.42      0.41       794
weighted avg       0.68      0.66      0.60       794

In [74]:
#Classify Flea/tick outcome based on media ad tags
classify_by_multilog(media, FleaTick_outcome)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:947: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)
Model Score:  0.614609571788
Log Loss:  0.886316627131
              precision    recall  f1-score   support

           0       0.62      0.99      0.76       489
           1       0.50      0.02      0.04       216
           2       0.00      0.00      0.00        89

    accuracy                           0.61       794
   macro avg       0.37      0.34      0.27       794
weighted avg       0.52      0.61      0.48       794

In [166]:
classify_by_multilog(medtype, Heartworm_outcome)
Model Score:  0.45717884131
Log Loss:  1.05977219825
              precision    recall  f1-score   support

           0       0.46      1.00      0.63       363
           1       0.00      0.00      0.00       225
           2       0.00      0.00      0.00       206

    accuracy                           0.46       794
   macro avg       0.15      0.33      0.21       794
weighted avg       0.21      0.46      0.29       794

/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [167]:
classify_by_multilog(medtype, Dental_outcome)
Model Score:  0.638539042821
Log Loss:  0.86029135014
              precision    recall  f1-score   support

           0       0.63      0.17      0.27       238
           1       0.00      0.00      0.00        70
           2       0.64      0.96      0.77       486

    accuracy                           0.64       794
   macro avg       0.42      0.38      0.35       794
weighted avg       0.58      0.64      0.55       794

/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [168]:
classify_by_multilog(medtype, FleaTick_outcome)
Model Score:  0.623425692695
Log Loss:  0.881222960177
              precision    recall  f1-score   support

           0       0.62      1.00      0.77       495
           1       0.00      0.00      0.00       216
           2       0.00      0.00      0.00        83

    accuracy                           0.62       794
   macro avg       0.21      0.33      0.26       794
weighted avg       0.39      0.62      0.48       794

/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [169]:
#ALL media tags at once:
cols_all = ['content_Charity',
 'content_Cophraphagia',
 'content_Dental',
 'content_Flea',
 'content_General',
 'content_Heartworm',
 'content_Insurance',
 'content_Laser',
 'content_Microchip',
 'content_Nutrition',
 'content_Senior',
 'content_Surgery',
 'content_Weight',
 'content_Youth',
 'media_Biography',
 'media_EduSponsored',
 'media_EduUnsponsored',
 'media_Practice',
 'media_SponsorStatic',
 'media_SponsorVid',
 'media_Support',
 'media_Welcome',
 'species_All',
 'species_Dog',
 'media_type_video']

classify_by_multilog(cols_all, Heartworm_outcome)
Model Score:  0.54282115869
Log Loss:  0.995659134208
              precision    recall  f1-score   support

           0       0.55      0.88      0.67       372
           1       0.40      0.14      0.21       227
           2       0.63      0.37      0.47       195

    accuracy                           0.54       794
   macro avg       0.52      0.46      0.45       794
weighted avg       0.52      0.54      0.49       794

In [170]:
classify_by_multilog(cols_all, Dental_outcome)
Model Score:  0.802267002519
Log Loss:  0.59486955326
              precision    recall  f1-score   support

           0       0.83      0.71      0.77       243
           1       0.00      0.00      0.00        57
           2       0.79      0.94      0.86       494

    accuracy                           0.80       794
   macro avg       0.54      0.55      0.54       794
weighted avg       0.75      0.80      0.77       794

In [172]:
classify_by_multilog(cols_all, FleaTick_outcome)
Model Score:  0.627204030227
Log Loss:  0.841942020163
              precision    recall  f1-score   support

           0       0.63      0.99      0.77       498
           1       0.33      0.01      0.03       215
           2       0.50      0.02      0.05        81

    accuracy                           0.63       794
   macro avg       0.49      0.34      0.28       794
weighted avg       0.54      0.63      0.50       794

In [212]:
#Create function that applied RandomForestRegressor to predict the values of ad rates for each "umbrella" ad group:
def evaluate_model(X_cols_list, columns_list):
    X_train, X_test, y_train, y_test = train_test_split(OK_df[X_cols_list].values, OK_df[columns_list].values,
                                                    train_size=0.6)

    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)
    

    max_depth = 30
    
    rand_for = RandomForestRegressor(max_depth = max_depth, min_samples_leaf = 3, bootstrap = True, oob_score=True, n_estimators = 50, random_state=1)
    rand_for.fit(X_train, y_train)

    y_rf = rand_for.predict(X_test)
    
    example_output = y_rf[0]
 
    
    
    import matplotlib.pyplot as plt
    ad_breakdown = pd.Series(example_output, columns_list)
    ad_breakdown.plot(x='Ad Type Breakdown by Percentage', kind = 'bar', rot =90, fontsize=10)
   
    print("Example distributions based on first predicted value from Test set.")
    print("Exmple input: ", X_test[0])
    plt.title('Example Distribution of Ad Types Recommended')
    
    plt.show()
    
    
    y_rf_train = rand_for.predict(X_train)

    print(f'MSE train score: {mean_squared_error(y_train, y_rf_train)}')
    print(f'MSE test score: {mean_squared_error(y_test, y_rf)}')

    print(f'R2 train score: {r2_score(y_train, y_rf_train)}')
    print(f'R2 test score: {r2_score(y_test, y_rf)}')
    print(f'Out-of-Bag test score: {rand_for.oob_score_}')

    importances = rand_for.feature_importances_
    


    #Display the feature importances in order of magnitude as a result of the Random Forest Regression model
    f_importances = pd.Series(importances, X_cols_list)

    # Sort the array in descending order of the importances
    f_importances.sort_values(ascending=False, inplace=True)

    # Make a bar Plot 
    f_importances.plot(x='Features', y='Importance', kind='bar', figsize=(16,9), rot=90, fontsize=30)
    plt.title('Feature Importance from Random Forest Regressor')

    plt.tight_layout()
    plt.show()
    
    

Below are the outputs of the Random Forest Regression models, in which the media types are the output variables. The goal of fitting a Random Forest Regression model was to "recommend" the distribution of content to be shown based on various features on the pet owner, pet compliance history, and pet compliance goals.

In [213]:
#With age and Count of Pets
predict_content = evaluate_model(X_cols_all, content)
predict_media = evaluate_model(X_cols_all, media)
predict_species = evaluate_model(X_cols_all, species)
predict_medtype = evaluate_model(X_cols_all, medtype)

print(predict_content)
print(predict_media)
print(predict_species)
print(predict_medtype)
Example distributions based on first predicted value from Test set.
Exmple input:  [ 0.05716385  1.          1.          1.          1.          0.          0.
  0.5         0.5         0.          0.          0.04878049  1.          0.        ]
MSE train score: 13.274156920847748
MSE test score: 25.963493235635195
R2 train score: 0.3923503777498638
R2 test score: -0.026240070791285097
Out-of-Bag test score: 0.026743940946895417
Example distributions based on first predicted value from Test set.
Exmple input:  [ 0.35633233  1.          0.5         1.          0.5         0.          0.
  0.          0.          0.          0.          0.02564103  1.          0.        ]
MSE train score: 15.51762335707936
MSE test score: 31.634758607783827
R2 train score: 0.42076484347298826
R2 test score: 0.03689499453500336
Out-of-Bag test score: 0.014697923063122405
Example distributions based on first predicted value from Test set.
Exmple input:  [ 0.37986774  1.          1.          1.          1.          1.          1.
  0.          0.          0.          0.          0.02564103  1.          1.        ]
MSE train score: 34.590674386964565
MSE test score: 73.69268338552271
R2 train score: 0.45408979472606964
R2 test score: 0.00882191141154498
Out-of-Bag test score: 0.00825316582607094
Example distributions based on first predicted value from Test set.
Exmple input:  [ 0.23751141  0.          0.5         0.          0.5         1.          1.
  0.          0.          0.          0.          0.04878049  1.          1.        ]
MSE train score: 39.46748319993298
MSE test score: 84.6039389407546
R2 train score: 0.49800430809174073
R2 test score: -0.07561956106862866
Out-of-Bag test score: -0.03374791719112369
None
None
None
None
In [62]:
#Without age and Count of Pets

#As seen in the correlation matrix as well, Dental scores appear to be most important feature and thus a model could be built for advertisement selection in order to influence 
predict_content2 = evaluate_model(X_cols_some, content)
predict_media2 = evaluate_model(X_cols_some, media)
predict_species2 = evaluate_model(X_cols_some, species)
predict_medtype2 = evaluate_model(X_cols_some, medtype)

print(predict_content2)
print(predict_media2)
print(predict_species2)
print(predict_medtype2)
MSE train score: 22.674087594442256
MSE test score: 26.293449376651683
R2 train score: 0.08483463570064213
R2 test score: 0.03560773586411508
MSE train score: 27.995472674432264
MSE test score: 28.62288109231723
R2 train score: 0.1097528334662548
R2 test score: 0.005051288613002031
MSE train score: 63.85055984699777
MSE test score: 59.60107736089323
R2 train score: 0.10481072868337221
R2 test score: 0.03777686114807549
/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:11: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # This is added back by InteractiveShellApp.init_path()
MSE train score: 64.79936612061674
MSE test score: 91.11065900750913
R2 train score: 0.11533856774921003
R2 test score: 0.004016789185360969
None
None
None
None

Logit models below are fitted WITH 'Age' and owner's 'Count of pets' included in the feature vector.

In [130]:
#modeling just Score ~ Age + Count of Pets

cols = ['Age', 'Count_of_pets']
Ys = OK_df['Heartworm'].values
X = OK_df[cols].values

X_train, X_test, y_train, y_test = train_test_split(X, Ys,
                                                        train_size=0.7)

min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')

model = logit.fit(X_train, y_train)

new_y = model.predict(X_test)

pred = model.predict_proba(X_test)    
from sklearn.metrics import classification_report
print(classification_report(y_test, new_y))
print('Model Score: ', model.score(X_test, y_test))

print('Log Loss: ', log_loss(y_test, pred))
              precision    recall  f1-score   support

           0       0.48      0.89      0.62       357
           1       0.48      0.28      0.36       226
           2       0.67      0.02      0.04       211

    accuracy                           0.48       794
   macro avg       0.54      0.40      0.34       794
weighted avg       0.53      0.48      0.39       794

Model Score:  0.48362720403
Log Loss:  1.04350106885
In [153]:
# model with features of Age, Count of pets, Heartworm previous score, Dental previous score, FleaTick previous score and target
# Post-appointment Heartworm scorecard value.

cols = ['Age', 'Count_of_pets', 'Heartworm_prev', 'Dental_prev', 'FleaTick_prev']
Ys = OK_df['Heartworm'].values
X = OK_df[cols].values

X_train, X_test, y_train, y_test = train_test_split(X, Ys,
                                                        train_size=0.7)

min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)
logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')

model = logit.fit(X_train, y_train)

new_y = model.predict(X_test)

pred = model.predict_proba(X_test)    
from sklearn.metrics import classification_report
print(classification_report(y_test, new_y))
print('Model Score: ', model.score(X_test, y_test))

print('Log Loss: ', log_loss(y_test, pred))
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       364
           1       0.80      0.78      0.79       236
           2       0.79      0.78      0.78       194

    accuracy                           0.86       794
   macro avg       0.84      0.84      0.84       794
weighted avg       0.86      0.86      0.86       794

Model Score:  0.862720403023
Log Loss:  0.442443269729
In [154]:
#Age of pet, owner's count of pets, and ad content as Independent Variables to CLASSIFY outcome of Heartworm Scorecard (0, 1, 2)

cols = ['Age', 'Count_of_pets', 'Heartworm_prev', 'Dental_prev', 'FleaTick_prev','content_Charity', 'content_Cophraphagia',
 'content_Dental',
 'content_Flea',
 'content_General',
 'content_Heartworm',
 'content_Insurance',
 'content_Laser',
 'content_Microchip',
 'content_Nutrition',
 'content_Senior',
 'content_Surgery',
 'content_Weight',
 'content_Youth']
Ys = OK_df['Dental'].values
X = OK_df[cols].values

X_train, X_test, y_train, y_test = train_test_split(X, Ys,
                                                        train_size=0.7)

min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')

model = logit.fit(X_train, y_train)

new_y = model.predict(X_test)

pred = model.predict_proba(X_test)    
from sklearn.metrics import classification_report
print(classification_report(y_test, new_y))
print('Model Score: ', model.score(X_test, y_test))

print('Log Loss: ', log_loss(y_test, pred))
              precision    recall  f1-score   support

           0       0.84      0.60      0.70       262
           1       0.00      0.00      0.00        72
           2       0.72      0.94      0.81       460

    accuracy                           0.74       794
   macro avg       0.52      0.51      0.51       794
weighted avg       0.69      0.74      0.70       794

Model Score:  0.744332493703
Log Loss:  0.668188502188
In [155]:
#Age of pet, owner's count of pets, and ad content as Independent Variables to CLASSIFY outcome of Heartworm Scorecard (0, 1, 2)

cols = ['Age', 'Count_of_pets', 'Heartworm_prev', 'Dental_prev', 'FleaTick_prev', 'media_Biography',
 'media_EduSponsored',
 'media_EduUnsponsored',
 'media_Practice',
 'media_SponsorStatic',
 'media_SponsorVid',
 'media_Support',
 'media_Welcome']
Ys = OK_df['Dental'].values
X = OK_df[cols].values

X_train, X_test, y_train, y_test = train_test_split(X, Ys,
                                                        train_size=0.7)

min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')

model = logit.fit(X_train, y_train)

new_y = model.predict(X_test)

pred = model.predict_proba(X_test)    
from sklearn.metrics import classification_report
print(classification_report(y_test, new_y))
print('Model Score: ', model.score(X_test, y_test))

print('Log Loss: ', log_loss(y_test, pred))
              precision    recall  f1-score   support

           0       0.85      0.21      0.33       272
           1       0.00      0.00      0.00        57
           2       0.63      0.98      0.77       465

    accuracy                           0.65       794
   macro avg       0.49      0.40      0.37       794
weighted avg       0.66      0.65      0.56       794

Model Score:  0.646095717884
Log Loss:  0.797622273668
In [156]:
#Age of pet, owner's count of pets, and ad content as Independent Variables to CLASSIFY outcome of Heartworm Scorecard (0, 1, 2)

cols = ['Age', 'Count_of_pets', 'Heartworm_prev', 'Dental_prev', 'FleaTick_prev', 'species_All','species_Dog']
Ys = OK_df['Dental'].values
X = OK_df[cols].values

X_train, X_test, y_train, y_test = train_test_split(X, Ys,
                                                        train_size=0.7)

min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')

model = logit.fit(X_train, y_train)

new_y = model.predict(X_test)

pred = model.predict_proba(X_test)    
from sklearn.metrics import classification_report
print(classification_report(y_test, new_y))
print('Model Score: ', model.score(X_test, y_test))

print('Log Loss: ', log_loss(y_test, pred))
              precision    recall  f1-score   support

           0       0.64      0.12      0.20       246
           1       0.00      0.00      0.00        72
           2       0.62      0.97      0.76       476

    accuracy                           0.62       794
   macro avg       0.42      0.36      0.32       794
weighted avg       0.57      0.62      0.51       794

Model Score:  0.619647355164
Log Loss:  0.844812279306
/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [157]:
#Age of pet, owner's count of pets, and ad content as Independent Variables to CLASSIFY outcome of Heartworm Scorecard (0, 1, 2)

cols = ['Age', 'Count_of_pets', 'Heartworm_prev', 'Dental_prev', 'FleaTick_prev', 'media_type_image']
Ys = OK_df['Dental'].values
X = OK_df[cols].values

X_train, X_test, y_train, y_test = train_test_split(X, Ys,
                                                        train_size=0.7)

min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')

model = logit.fit(X_train, y_train)

new_y = model.predict(X_test)

pred = model.predict_proba(X_test)    
from sklearn.metrics import classification_report
print(classification_report(y_test, new_y))
print('Model Score: ', model.score(X_test, y_test))

print('Log Loss: ', log_loss(y_test, pred))
              precision    recall  f1-score   support

           0       0.63      0.09      0.16       256
           1       0.00      0.00      0.00        55
           2       0.63      0.98      0.76       483

    accuracy                           0.63       794
   macro avg       0.42      0.36      0.31       794
weighted avg       0.58      0.63      0.52       794

Model Score:  0.625944584383
Log Loss:  0.824972251282
/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [ ]:
#Multi output regression and RandomForestRegressor information
#https://nealde.github.io/blog/2017/06/15/Random-Forest-Tutorial/